####################################### DATA ANALYSIS WITH R ####################################### # ## NUMERICAL SUMMARY # possum<-read.table("possum.txt",header=TRUE) attach(possum) head(possum, n=3) # display the first three rows str(possum) # get information on each of the columns dim(possum) # 104 obs, 14 variables possum[!complete.cases(possum),] # which rows have missing values in the data range(earconch) # min and max values of ear conch length mean(earconch) # sample mean mean(earconch,trim=0.95) # trimmed mean (the first 5% smaller and 5% larger # observations are omitted in computing # the mean, robust to the presence of outliers) mean(footlgth) # [1] NA, there is a not available value mean(footlgth,na.rm=T) # omit NA values from the computation median(earconch) # median var(earconch) # sample variance sum((earconch-mean(earconch))^2)/(length(earconch)-1) # the same sqrt(var(earconch)); # standard deviation sd(earconch) # the same quart<-quantile(earconch,probs=c(0.25,0.75)) # quartiles quart sd(earconch);0.75*(quart[2]-quart[1]) # s~0.75H suggests normality summary(earconch) # range, mean, median and quartiles detach(possum) summary(possum) # see what generates for 'Pop' and 'sex' # ## GRAPHICAL SUMMARY # # ## Histograms and density plots # possum.f<-possum[possum$sex=="f",] # create a subset data frame with female possums # same as 'possum[which(possum$sex=="f"),]' head(possum.f,n=3) # equivalent to data frame 'fossum' dim(possum.f) # 43 obs attach(possum.f) hist(totlngth, probability=TRUE) # histogram of total length of female possums (relative frequency and not absolute) hist(totlngth, prob=TRUE,breaks=10) # change the number of breakpoints # "prob=TRUE" works as well as "probability=TRUE" # how to change the breakpoints range(totlngth) # min=75.0 max=96.5 par(mfrow=c(2,1)) # to get a 2 by 1 layout hist(totlngth, breaks=72.5+(0:5)*5, # A: specify breaks at 72.5,77.5,... prob=TRUE,xlab="Total length (cm)", # put legends on x-axis main="A: breaks at 72.5,77.5,...") # put a title #mtext("A: breaks at 72.5,77.5,...",side=3) # see the difference with main="" hist(totlngth, breaks=75+(0:5)*5, # B: specify breaks at 75,80,... prob=TRUE,xlab="Total length (cm)", main="B: breaks at 75,80,...") #mtext("A: breaks at 72.5,77.5,...",line=3,side=3) # see the difference with main="" # plot A suggests that the distribution is # asymmetric (skewed to the left), # while plot B suggests symmetry dens<-density(totlngth) # density estimate, depends on bandwidth parameter x.lim<-range(dens$x);y.lim<-range(dens$y) # to be used as arguments of 'hist' hist(totlngth, breaks=72.5+(0:5)*5, # C: same as histogram A prob=TRUE,xlim=x.lim,ylim=y.lim, xlab="Total length (cm)", main="C: breaks as in A") lines(dens) # add density plot hist(totlngth, breaks=75+(0:5)*5, # D: same as histogram B prob=TRUE,xlim=x.lim,ylim=y.lim, xlab="Total length (cm)", main="D: breaks as in B") lines(dens) # add density plot # a density plot is more reliable in evaluating the # shape of the distribution par(mfrow=c(1,1)) # back to 1 by 1 layout detach(possum.f) # LATTICE style density plot (OPTIONAL) # useful for providing graphs that allow ready # comparison between grouping in the data library(lattice) trellis.device(color=FALSE) # open the device, no color densityplot(~earconch | Pop, groups=sex, # density plot that compares 'earconch' of data=possum, pch=c(1,4), # male and female possums conditioned auto.key=list(columns=2,points=TRUE), # on the levels of the variable Pop (factor) aspect=1) # auto.key is for the legend, aspect controls # physical aspect ratio of the panels #trellis.device(color=FALSE) # try this, less options specified #densityplot(~earconch | Pop,groups=sex,data=possum) help(densityplot) # see the help if interested # ## Box plots # #possum.f<-subset(possum, sex=="f") old.par <- par(no.readonly = TRUE) # all graphical parameters settings # that could be changed par(fig=c(0, 1, 0, 0.5)) # coordinates of the figure region in the display # region of the device: 0 to 50% of # height of figure region boxplot(possum.f$totlngth, horizontal=TRUE, # box plot of total length of female possums xlab="Total length (cm)", # horizontal=FALSE is the default main="Box plot") rug(possum.f$totlngth) # add data points on the x-axis par(fig=c(0, 1, 0.5, 1),new=TRUE) # 50% to 100% of height of figure region, # specify 'new=TRUE' since we use the same device # (counterintuitive indeed, see 'help(par)') possum.m<-possum[possum$sex=="m",] # create a subset data frame with male possums boxplot(possum.m$totlngth, horizontal=TRUE, # box plot of total length of male possums xlab="Total length (cm)", main="Box plot") rug(possum.m$totlngth) # add data points on the x-axis par(fig=c(0, 1, 0.2, 0.8)) # 20% to 80% of height of figure region boxplot(totlngth~sex, data=possum, # data values are split into groups according horizontal=TRUE, # to the grouping variable sex (a factor) xlab="Total length (cm)", # display one box plot for each group main="Box plots by sex") # labels on y-axis taken from the levels of sex rug(possum.f$totlngth,side=1) # add data points on the x-axis rug(possum.m$totlngth,side=3) # add data points on the top-horizontal axis #boxplot(split(possum$totlngth,possum$sex), # do the same # horizontal=TRUE) # 'split' divides an object according to levels in # a factor, no legend on the x-axis x<-possum.f$totlngth y<-possum.m$totlngth par(fig=c(0.2, 0.8, 0, 1)) # 20% to 80% of width of figure region boxplot(x,y,xlab="Total length (cm)", # the same as before, but box plots now main="Box plot by sex") # are displayed vertically (default) rug(x,side=2);rug(y,side=4) # add data points on the axes # no legend on the y-axis par(old.par) # restore the previous graphical settings plot( x,rep(0,length(x)),ylim=c(0,1), # a simple alternative to display xlab="Total length (cm)",ylab="") # the two sub samples points( y,rep(1,length(y)) ) #plot(c(x,y),rep(0:1,c(length(x),length(y))), # do the same more concisely # xlab="Total length (cm)",ylab="") plot(as.numeric(sex)~totlngth,data=possum) # do the same, points at 1 and 2 this time unclass(possum$sex) # remember how a factor is coded in R # ## PROBABILITY DISTRIBUTIONS # rbinom(10,size=1,prob=.5) # 10 Bernoulli trials round( dpois(0:4,lambda=3), 2) # Prob(X=x), x=0,1,2,3,4 X~Pois(3) round( ppois(0:4,lambda=3), 2) # Prob(X<=n), X~Pois(3) (CUMULATIVE DISTRIBUTION FUNCTION) round( cumsum( dpois(0:4,lambda=3) ), 2) # the same y<- rexp(50,rate=2) # 50 variates from exponential variable # of rate 2, Exp(2) (by default 'rate=1') summary(y) # mean approximatively equal to 1/rate hist(y,prob=TRUE,breaks=10, # histogram of y main="Histogram of exponential variates")# 'breaks' used for specifying the number of bins z<-seq(0,3,length=31) # 31 equaly spaced points in [0,3] lines(z,dexp(z,2)) # add the density of Exp(2) 1-pnorm(1.644) # Prob(X>1.644), X~Norm(0,1) qexp(0.75,2) # q: Prob(X<=q)=0.75, X~Exp(2) # ## Normal Distribution # x<- rnorm(100) # 100 variates from standard normal Norm(0,1) # (by default 'mean=0' and 'sd=1') summary(x) # mean approximatively equal to 1, quart<-summary(x)[c(2,5)] # quartiles sd(x);0.75*(quart[2]-quart[1]) # s~0.75H suggests normality hist(x,prob=TRUE) # histogram, shoul look symmetric and bell-shaped # PLOT THE DENSITY FUNCTION N(0,1) z<-seq(-3,3,length=31) # 31 equally spaced points in [-3,3] ht<-dnorm(z) # density function evaluated a these points plot(z,ht,type="l",xlab="",ylab="Density", # plot the normal density, in the range (-3,3) main="Norm(0,1)",yaxs="i") # 'yaxs="i"' locates the axes at the limits # of the data hist(x,prob=TRUE,ylim=range(ht), # specify y-axis range to include the density main="Histogram of normal variates") lines(z,ht) # add the normal density to the histogram # Additional examples pnorm(0) # 0.5, exactly half the area under the density # is to the left of the mean pnorm(1) # .841 pnorm(-1.96) # .025 pnorm(1.96) # .975 pnorm(1.96, mean=2) # .484 for a normal distribution with mean=2, sd=1 pnorm(1.96, sd=2) # .836 for a normal distribution with mean=0, sd=2 qnorm(.9) # 90th percentile; mean=0 and sd=1 qnorm(0.841) # 1.0 qnorm(0.5) # 0 qnorm(0.975) # 1.96 qnorm(c(.1,.2,.3)) # -1.282 -0.842 -0.524, # i.e. 10th, 20th and 30th percentiles qnorm(.1, mean=100, sd=10) # 87.2 10th percentile, mean=100, sd=10 #pdf("fig.pdf",paper="special", # save the plot in a pdf file # height=6,width=6) old.par <- par(no.readonly = TRUE) #OPTIONAL PLOT par(mar=c(4.1,4.1,1.1,1.1), mgp=c(2.5,0.5,0), cex.axis=0.85) z <- seq(-3,3,length=101) plot(z, dnorm(z), type="l", xlab="",ylab="", yaxs="i", bty="L") title(xlab=list("z",cex=1.5), ylab=list("normal density",cex=1.5)) polygon(c(z[z <= 1.0],1.0),c(dnorm(z[z <= 1.0]), dnorm(-3)), col="grey") chh <- par()$cxy[2] arrows(-1.8, 0.32, -0.25, 0.2, length=0.07, xpd=T) cump <- round(pnorm(1), 3) text(-1.8, 0.32+0.75*chh, paste("pnorm(1)\n", "=", cump), xpd=T, cex=1.2) par(old.par) # restore the previous graphical settings #dev.off() # ## Poisson Distribution # k<-0:15 # [1] 0 1 ... 14 15 l<-5 # Poisson rate=5 (also mean of the distribution) p<-dpois(k,lambda=l) # exact probabilities at k round(p,3) #pdf("fig.pdf",paper="special", # save the plot in a pdf file # height=6,width=6) plot(k,p,type="h",lwd=5,ylim=c(0,0.25), # ' type="h" ' for histogram like vertical lines xlab="",ylab="",main="") # specify axes legends and title later y.string<-expression( # string to be used as ylab paste("Poisson prob ",lambda==5) # the text is mathematical expression formatted ) # according to TEX-like rules ('==' is the syntax # for equals) see 'help(plotmath)' title(xlab=list("k",cex=1.5), # 'title' takes lists as arguments ylab=list(y.string,cex=1.5)) # specify the font size by 'cex' #dev.off() # Histograms from simulated data for n=10,100,500,1000 # use of a loop + conditional evaluation + add a legend to an existing plot kfin<-17 # upper limit for the distribution n<-c(10,100,500,1000) # sample sizes y.string<-expression(paste("poisson prob ", lambda==5)) t.string<-paste("n=",n) # 'paste' is used to create four different titles # with the sample size specified k<-0:15;l<-5;p<-dpois(k,lambda=l) plot(k,p,type="h",lwd=5,ylim=c(0,0.25), # histogram-like plot of the true xlab="",ylab="",main="") # distribution Pois(5) j=1 title(main=list(t.string[j],cex=1.4), # 't.string[j]' contains "n=n[j]", the title xlab=list("k",cex=1.5), # x-axis legend "k" ylab=list(y.string,cex=1.5)) # y-axis legend "poisson prob lamda=5" r<-rpois(n[j],lambda=5) # simulate n[j] Poisson variates of rate 5 out<-hist(r,breaks=c(-1:kfin),plot=F) # store the output of the calling 'hist' # we want the relative frequency at 0:17 # so we specify the breaks as -1:17, # then we recover # the component 'density' from the list 'out' for(i in 0:kfin) { # loop for plotting the vertival lines at each # value in 0:17 with eight = relative frequency if(out$density[i+1]>0) { # check that the relative frequency is non null lines(c(i+0.2,i+0.2), # if so, plot the vertical lines c(0,out$density[i+1]),lwd=5,col=2) # a little to the right } } legend(8,.2,c("true prob.","histogram"), # add a legend lty=c(1,1),col=c(1,2),lwd=c(5,5)) old.par <- par(no.readonly = TRUE) par(mfrow=c(2,2)) # to get a 2 by 2 layout with the 4 plot for # n=10, 100, 500, 1000 for(j in 1:4) { # external loop for drawing the four plots plot(k,p,type="h",lwd=5,ylim=c(0,0.25),xlab="",ylab="",main="") title(main=list(t.string[j],cex=1.4),xlab=list("k",cex=1.5),ylab=list(y.string,cex=1.5)) r<-rpois(n[j],lambda=5) out<-hist(r,breaks=c(-1:kfin),plot=F) for(i in 0:kfin) { # internal loop if(out$density[i+1]>0) { lines(c(i+0.2,i+0.2),c(0,out$density[i+1]),lwd=5,col=2) } } # close internal loop legend(8,.2,c("true prob.","histogram"),lty=c(1,1),col=c(1,2),lwd=c(5,5)) } # close external loop par(old.par) # restore the previous graphical settings # i.e. back to 1 by 1 layout # ## Iteration # x<- (1:4000)*(pi/40); # evaluate the sin of each element of 'x' y<-rep(0,length=length(x)) # and store them into the vector 'y' for(i in 1:length(x)) { # loop, i serves as index for y[] and x[] y[i]<-sin(x[i]) } y for(i in 1:length(x)) y[i]<-sin(x[i]) # the same (without {...}, only 1 command) y y<- sin(x) # the same but faster, try 400000 instead of 4000 y # ## Conditional Evaluation # # simple example of 'if' statement x<-rpois(1,5) # x is a value from Pois(5) if (x>3) { y<-1 z<-2 # (y,z)=(1,2) if x>3 } else { y<-2 z<-1 } # (y,z)=(1,2) if x<=3 x;y;z x<-rpois(1,5) # example of 'if' statements that are nested count<-0 # count=1 if 22) { # count=-1 if x<=2 or x>=5 if (x<5) { count<-count+1 } else count<-count-1 } x;count count<-0 # this leads a syntax error: if (x>2) # there is a new line between the command if (x<5) count<-count+1 # in the 'if'-part and the 'else'-part else { count<-count-1 } count<-0 # this is ok if (x>2) # no {} are used to delimit branches if (x<5) count<-count+1 else count<-count-1 x;count x<-rpois(1,5) count<-0 # this is ok if (x>2) { # here we use {} to delimit branches if (x<5) { count<-count+1 } else { count<-count-1 } } x;count x<-rpois(1,5) count<-0 # compound conditions if (x>2 & x<5){ # does the same as before count<-count+1 } else count<-count-1 x;count # ## Random Number Generation # set.seed(1) runif(10) set.seed(1) runif(10) # the same runif(10) # different # ## QQ Plot # y<- rexp(50,2) ppoints(y) # sequence of ìprobabilityî points # (1:m - a)/(m + (1-a)-a) where m is length(x) seq(0,1,length=length(y)) # note the difference, this are all equi-spaced plot(qexp(ppoints(y),1),sort(y)) # plot the empirical quantiles against the abline(0,1,col=2) # theoretical quantiles # data consistent with the theoretical distribution # should have points falling on the straight line qqplot(qexp(ppoints(y),1),y) # the same, no need to sort y abline(0,1,col=2) qqplot(qexp(ppoints(y),2),y) abline(0,1,col=2) # in fact y was generated with rate=2 x<- rnorm(50) qqnorm(x) # R built-in function for comparing against abline(0,1,col=2) # normal distribution (mean=0, sd=1) #qqplot(qnorm(ppoints(x)),x) # the same points(qnorm(ppoints(x)),sort(x), pch=16,col=2) # the same x<- rnorm(50,mean=1,sd=1) # mean different from zero qqnorm(x) # points not on the straight line abline(0,1,col=2) qqnorm(x-mean(x)) # subtract the empirical mean abline(0,1,col=2) # ok qqplot(x,y) # comparing two samples abline(0,1,col=2) # they clearly follow different distribution boxplot(x,y) # boxplot plot(c(x,y),rep(0:1,c(length(x),length(y))), # other way for comparing the two samples xlab="",ylab="") # Correlation x1 <- x2 <- x3 <- (11:30)/5 # create artificial data for x1, x2, x3 y1 <- x1 + rnorm(20)/2 # create artificial data for y1 y2 <- 2 - 0.05*x1 + 0.1*((x1 - 1.75))^4 + # create artificial data for y2 1.25 * rnorm(20) y3 <- (x1 - 3.85)^2 + 0.015 + rnorm(20)/4 # create artificial data for y3 theta <- ((2 * pi) * (1:20))/20 x4 <- 10 + 4 * cos(theta) # create artificial data for x4 y4 <- 10 + 4 * sin(theta) + (0.5 * rnorm(20)) # create artificial data for y4 x<-x1;y<-y1 #x<-x2;y<-y2 #x<-x3;y<-y3 #x<-x4;y<-y4 r <- round(cor(x, y), 3) # Pearson correlation rho <- round(cor(rank(x), rank(y)), 3) # Spearman rank correlation print(c(r, rho)) plot(x, y, pch = 16) # scatterplot cov(x,y) # sample covariance (same as 'var(x,y)') var(cbind(x,y)) # sample variance and covariances cor(x,y) # Pearson correlation cor(cbind(x,y)) # 2 by 2 correlation cov(x,y)/(sd(x)*sd(y)); cor(x,y) # relation among 'cov','cor' and 'sd' # log transformation library(MASS) # data frame Animals (MASS) attach(Animals) # Correlation between body and brain: cor(body, brain) # Pearson correlation cor(body, brain,method="spearman") # Spearman rank correlation plot(body, brain, pch = 16) # scatterplot plot(body, brain, log="xy",pch = 16) # scatterplot at the logarithmic scale plot(log(body), log(brain),pch = 16) # the same with original axes's scale detach(Animals)